# 日期：2021年8月7日
# 作者：鲍鑫涛
# 名称：笔趣阁爬虫
import time
from urllib.parse import urljoin

import requests
from lxml import etree

# Crawl scope: site root, request headers, and the entry page.
BASE_DOMAIN = "https://www.xbiquge.la"

# Request headers carrying a desktop Chrome User-Agent string.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    ),
}

# Entry URL handed to run() when the file is executed as a script.
START_URLS = BASE_DOMAIN + "/xiaoshuodaquan/"


def parse_page(url, timeout=10):
    """Download a page and parse it into an lxml HTML tree.

    Args:
        url: URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` waits indefinitely, so a single stalled
            connection would hang the whole crawl.

    Returns:
        The parsed tree returned by ``etree.HTML`` for the page body.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    resp = requests.get(url, headers=HEADERS, timeout=timeout)
    # Decode the raw bytes explicitly as UTF-8 instead of relying on the
    # encoding requests would guess for ``resp.text``.
    text = resp.content.decode('utf-8')
    return etree.HTML(text)


def run(url):
    """Crawl the novel index page and every novel linked from it.

    Prints each novel's title and URL, then hands the URL to
    ``get_chap`` and pauses for one second before the next novel.

    Args:
        url: URL of the index page whose ``<div id="main">`` lists the
            novels.
    """
    tree = parse_page(url)
    # Take the title and the link from the same <a> element. Pairing two
    # independent text()/@href result lists by index goes out of step (or
    # raises IndexError) as soon as one list item carries an extra text
    # node or no link.
    for link in tree.xpath("//div[@id='main']//ul/li//a[@href]"):
        name = "".join(link.itertext()).strip()
        # urljoin leaves absolute links untouched and resolves relative
        # ones against the page they were found on.
        novel_url = urljoin(url, link.get("href"))
        # The URL is printed wrapped in a list to keep the existing
        # output format.
        print(name, [novel_url])
        get_chap(novel_url)
        time.sleep(1)

def get_chap(url):
    """Crawl every chapter linked from a novel's table-of-contents page.

    Prints each chapter's title and absolute URL, then hands the URL to
    ``get_content`` and pauses for one second before the next chapter.

    Args:
        url: URL of the novel page whose ``<dl>``/``<dd>`` elements hold
            the chapter links.
    """
    tree = parse_page(url)
    # Take the title and the link from the same <a> element so the two can
    # never go out of step, which index-pairing separate text()/@href
    # result lists cannot guarantee.
    for link in tree.xpath("//dl//dd//a[@href]"):
        name = "".join(link.itertext()).strip()
        # Resolve the href against the page URL: root-relative links get
        # the site prefix, while absolute and page-relative links are
        # handled correctly instead of being blindly concatenated.
        chapter_url = urljoin(url, link.get("href"))
        print(name, chapter_url)
        get_content(chapter_url)
        time.sleep(1)


def get_content(url):
    """Fetch a chapter page and print its body text as one string.

    The text nodes directly under ``<div id="content">`` are trimmed,
    whitespace-only nodes are dropped, and the rest are concatenated
    without a separator before being printed.

    Args:
        url: URL of the chapter page.
    """
    page = parse_page(url)
    pieces = []
    for node in page.xpath("//div[@id='content']/text()"):
        trimmed = node.strip()
        # Skip nodes that held nothing but whitespace.
        if trimmed:
            pieces.append(trimmed)
    print("".join(pieces))


if __name__ == "__main__":
    # Script entry point: start crawling from the configured index page.
    run(START_URLS)
